* Begin with an empty session.
clear all

* The working directory must already be set to the Data folder.
use "retrostudy2 data.dta"

*ip status
* nonus is 0 only when countryname is exactly "United States", 1 otherwise.
generate nonus = (countryname != "United States")
* blockfull copies block, except that non-US rows with block==0 receive code 3.
generate blockfull = cond(nonus == 1 & block == 0, 3, block)
label define blocklab 0 "Valid" 1 "VPS User" 2 "VPS Uncertain" 3 "Foreign"
label values blockfull blocklab
* valid is 1 where blockfull is 0, and 0 where blockfull lies between 1 and 3.
recode blockfull (0 = 1) (1/3 = 0), generate(valid)

*attention checks
* check1 and check2 are 1 when both items of a check hold the expected pair of
* values (0 and 100 for the first check, 100 and 0 for the second), 0 when they
* do not, and missing when either item is system missing.
generate check1 = cond(ac1_1 == . | ac1_2 == ., ., ac1_1 == 0 & ac1_2 == 100)
generate check2 = cond(ac2_1 == . | ac2_2 == ., ., ac2_1 == 100 & ac2_2 == 0)
* Number of failed checks: two minus the number passed (missing propagates).
generate checkfails = 2 - (check1 + check2)
tabulate checkfails

*check based on reported city
* citycheck codes 1 through 10 become 1, code 11 becomes 0, anything else is
* carried over unchanged.
generate failcity = cond(inrange(citycheck, 1, 10), 1, cond(citycheck == 11, 0, citycheck))
tabulate failcity

*age check
generate born = yrborn + 7
* Two age entries that hold the values 1989 and 1993 are replaced by 29 and 25.
replace age = 29 if age == 1989
replace age = 25 if age == 1993
* failage is 0 when born minus age is between 0 and 1, 1 otherwise, and
* missing when born or age is system missing.
generate failage = cond(born == . | age == ., ., !inrange(born - age, 0, 1))
tabulate failage

*data quality index
* suspindex is the scale score that alpha generates from every variable whose
* name starts with fail or susp; casewise leaves out respondents with a missing
* value on any item.
* NOTE(review): susp5 below multiplies by 5, so the wildcards are presumably
* expected to match exactly five items -- confirm against the data set.
alpha fail* susp*, item gen(suspindex) casewise
* suspdi: scores from .2 through .8 become 1, every other value (including
* missing) is carried over unchanged. The upper bound is float(.8) rather than
* the literal .8 because a score stored as float holds .8 as a value slightly
* above the double-precision .8, which a plain .2/.8 range would fail to match.
gen suspdi = cond(inrange(suspindex, .2, float(.8)), 1, suspindex)
* Count of failed checks; round() keeps the values exact integers for the
* discrete histogram drawn later.
gen susp5 = round(suspindex*5)

*pid and ideology
* PID starts from party: code 1 moves to 2, code 2 to 6, codes 3 and 4 to 4,
* and -99 is set to missing.
generate PID = party
recode PID (1 = 2) (2 = 6) (3/4 = 4) (-99 = .)
* The middle category is split using partylean (-1 gives 3, 1 gives 5).
replace PID = 3 if PID == 4 & partylean == -1
replace PID = 5 if PID == 4 & partylean == 1
* The two outer categories are pushed to the ends when partystrong is 1.
replace PID = 1 if PID == 2 & partystrong == 1
replace PID = 7 if PID == 6 & partystrong == 1
* Reverse the ideology codes 1, 2, 4 and 5 (code 3 and other values stay put).
replace ideology = 6 - ideology if inlist(ideology, 1, 2, 4, 5)


************
* Analysis *
************

tabulate blockfull

*attention check failure by ip status
* One proportion table per blockfull code, 0 through 3.
forvalues status = 0/3 {
	proportion checkfails if blockfull == `status'
}

* Pairwise comparisons of two blockfull codes at a time, with column
* percentages and a chi-squared test.
tabulate checkfails blockfull if inlist(blockfull, 0, 1), column chi2
tabulate checkfails blockfull if inlist(blockfull, 0, 3), column chi2
tabulate checkfails blockfull if inlist(blockfull, 1, 3), column chi2

* All graphs saved from here on are written to the figures folder.
cd "../Output Figures"

*left-hand panel of figure 4 - attention check failures by ip status
* Regressing on the full set of blockfull indicators without a constant gives
* one coefficient per blockfull code, which coefplot then draws as bars.
regress checkfails ibn.blockfull, noconstant
estimates store c1
coefplot c1, vertical ylabel(0(.5)1.5) label(1 "block=1") recast(bar) barwidth(.5) color(gs10) citop ///
	coeflabels(0.blockfull="Valid" 1.blockfull=`""VPS" "User""' 2.blockfull=`""VPS" "Uncertain""' 3.blockfull="Foreign ") ///
	ytitle("Number of Failed Attention Checks") subtitle("Attention Checks") saving(attention.gph, replace)

*valid respondents by attention check failure
* Full sample, then respondents with fewer than two failures, then none.
tabulate valid
tabulate valid if checkfails < 2
tabulate valid if checkfails == 0
	
*data quality checks
tabulate suspdi

* Estimate the suspdi proportions separately for each blockfull code and keep
* the results as m1 (code 0) through m4 (code 3).
forvalues status = 0/3 {
	local slot = `status' + 1
	proportion suspdi if blockfull == `status'
	estimates store m`slot'
}
* NOTE(review): keep(1) depends on how proportion names its coefficients,
* which may differ across Stata releases -- confirm the plot still shows the
* suspdi==1 proportion.
coefplot m1 m2 m3 m4, keep(1) vertical citype(logit) legend(off) recast(bar) barwidth(.1) color(gs10) citop ciopts(lcolor(gs5)) ///
	xlabel(.7 "Valid" .9 `""VPS" "User""' 1.1 `""VPS" "Uncertain""' 1.3 "Foreign") ///
	ylabel(0(.25).75) ytitle("Proportion Flagged for Low-Quality Data") ///
	subtitle("Data Quality Checks") saving(quality.gph, replace)

* Put the attention-check panel and the data-quality panel side by side.
graph combine attention.gph quality.gph, saving(figure4.gph, replace)

*appendix a6. analysis of individual quality checks
summarize failcity failage susplocation susptask suspcomments
correlate failcity failage susplocation susptask suspcomments

*appendix a6. distribution of quality check failures
* One discrete histogram of susp5 per blockfull code, laid out in a single row.
histogram susp5, discrete by(blockfull, rows(1) note("")  title("Number of Failed Quality Checks by VPS Status (Study 2)")) xlabel(0(1)4) xtitle("Number of Failed Quality Checks") saving(checksbystatus2.gph, replace)

*table a2. relationship between PID & ideology
* Correlation within each blockfull code, then a regression that interacts
* the blockfull indicators with continuous ideology.
by blockfull, sort: correlate PID ideology
regress PID i.blockfull##c.ideology
